In [7]:
## Data ingestion: plain-text source
# Read speech.txt (decoded as UTF-8) into a list of LangChain Document objects.
from langchain_community.document_loaders import TextLoader

loader = TextLoader(
    file_path="speech.txt",
    encoding="utf-8",
)
text_documents = loader.load()
# Bare last expression so the notebook displays the loaded documents
text_documents

[Document(metadata={'source': 'speech.txt'}, page_content='Respected citizens, today we gather to celebrate the spirit of unity and progress. Our nation stands at a crossroads, and it is our collective responsibility to move forward with determination and hope. The Indian National Congress has always believed in the values of democracy, secularism, and social justice.\n\nWe promise to work tirelessly for the welfare of every citizen, ensuring equal opportunities and inclusive growth. Let us join hands to build a brighter future for our children, strengthen our institutions, and uphold the dignity of every individual.\n\nTogether, we can overcome challenges and achieve new heights. Jai Hind!')]

In [6]:
import os
from dotenv import load_dotenv
load_dotenv()

# os.getenv reads from os.environ, so this can only re-store a value that is
# already present. When the variable is missing, os.getenv returns None and
# assigning None into os.environ raises TypeError, so only write it back when set.
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is not None:
    os.environ["OPENAI_API_KEY"] = openai_api_key


In [16]:
# Web-based loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

# Load the blog post into Document objects. Only loading happens in this cell;
# splitting and indexing are done by later cells.
# Note: this rebinds `loader` and `text_documents`, which other loader cells also assign.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2025-05-01-thinking/",),
)
text_documents = loader.load()

# Bare last expression so the notebook displays the loaded documents
text_documents

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2025-05-01-thinking/', 'title': "Why We Think | Lil'Log", 'description': 'Special thanks to John Schulman for a lot of super valuable feedback and direct edits on this post.\nTest time compute (Graves et al. 2016, Ling, et al. 2017, Cobbe et al. 2021) and Chain-of-thought (CoT) (Wei et al. 2022, Nye et al. 2021), have led to significant improvements in model performance, while raising many research questions. This post aims to review recent developments in how to effectively use test-time compute (i.e. “thinking time”) and why it helps.', 'language': 'en'}, page_content='\n\n\n\n\n\nWhy We Think | Lil\'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLil\'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n|\n\n\n\n\n\n\nPosts\n\n\n\n\nArchive\n\n\n\n\nSearch\n\n\n\n\nTags\n\n\n\n\nFAQ\n\n\n\n\n\n\n\n\n\n      Why We Think\n    \nDate: May 1, 2025  |  Estimated Reading Time: 40 min  |  Author: Lilian

In [6]:
from langchain_community.document_loaders import PyPDFLoader

# Read the PDF into Document objects. Only loading happens in this cell;
# splitting and indexing are done by later cells.
# Note: `loader` and `text_documents` are shared with the other loader cells, so
# whichever loader cell ran last decides what the downstream cells process.
loader = PyPDFLoader(file_path="Sreekanth Pogula.pdf")
text_documents = loader.load()
# Bare last expression so the notebook displays the loaded pages
text_documents

[Document(metadata={'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2017-11-07T10:08:38+05:30', 'author': 'Sripradha Gulla', 'moddate': '2017-11-07T10:08:38+05:30', 'source': 'Sreekanth Pogula.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='KRA Aspect Metric Target \nTechnology Technologies competence assessment rating (has comprehensive \nunderstating of the technologies and has applied the technologies; \nAchieved assessment score of 70% in Technology Test(s)) \n>= 70% \nSoftware \nEngineering \nEngineering competence assessment rating (is able to analyze, design and \ndevelop software components (both UI and server) to meet requirements; \nAchieved assessment score of 70% in SE Test and Project Work) \n>= 70% \nProblem Solving Problem solving competence assessment rating (is able to solve medium \ncomplexity problems that involve analytical and algorithmic thinking for \ndesign of optimal solution; Achieved assessment scor

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the currently loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200) for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
text_chunks = text_splitter.split_documents(text_documents)
# Bare last expression so the notebook displays the resulting chunks
text_chunks

[Document(metadata={'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2017-11-07T10:08:38+05:30', 'author': 'Sripradha Gulla', 'moddate': '2017-11-07T10:08:38+05:30', 'source': 'Sreekanth Pogula.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='KRA Aspect Metric Target \nTechnology Technologies competence assessment rating (has comprehensive \nunderstating of the technologies and has applied the technologies; \nAchieved assessment score of 70% in Technology Test(s)) \n>= 70% \nSoftware \nEngineering \nEngineering competence assessment rating (is able to analyze, design and \ndevelop software components (both UI and server) to meet requirements; \nAchieved assessment score of 70% in SE Test and Project Work) \n>= 70% \nProblem Solving Problem solving competence assessment rating (is able to solve medium \ncomplexity problems that involve analytical and algorithmic thinking for \ndesign of optimal solution; Achieved assessment scor

In [35]:
# Embed the text chunks and index them in a Chroma vector store
from langchain_community.vectorstores import Chroma
# Import the embeddings class from langchain_community, like every other
# LangChain integration in this notebook, instead of the legacy
# `langchain.embeddings` re-export path.
from langchain_community.embeddings import OllamaEmbeddings

# No model name is passed, so the class's default embedding model is used.
# `embeddings` is kept as a module-level name because later cells reuse it
# to embed their queries.
embeddings = OllamaEmbeddings()
db = Chroma.from_documents(text_chunks, embeddings)

  embeddings = OllamaEmbeddings()


In [37]:
# Embed the question and look up the single closest chunk in the Chroma store
query = "What is the main topic of the article?"
query_embedding = embeddings.embed_query(query)
docs = db.similarity_search_by_vector(query_embedding, k=1)
# Show the text of the best match
docs[0].page_content

'KRA Aspect Metric Target \nTechnology Technologies competence assessment rating (has comprehensive \nunderstating of the technologies and has applied the technologies; \nAchieved assessment score of 70% in Technology Test(s)) \n>= 70% \nSoftware \nEngineering \nEngineering competence assessment rating (is able to analyze, design and \ndevelop software components (both UI and server) to meet requirements; \nAchieved assessment score of 70% in SE Test and Project Work) \n>= 70% \nProblem Solving Problem solving competence assessment rating (is able to solve medium \ncomplexity problems that involve analytical and algorithmic thinking for \ndesign of optimal solution; Achieved assessment score of 70% in Problem \nSolving Hackathon)        \n>= 70% \nCommunication Communication competence assessment rating (is able to listen and \ncommunicate (oral, written, presentation) effectively; Achieved assessment \nscore of 70% in BCE-IC)  \n>= 70%'

In [39]:
## FAISS vector store
from langchain_community.vectorstores import FAISS

# Build the index with the same `embeddings` object that embeds the query
# below, so indexing and querying cannot drift onto different embedding
# configurations (previously a second, separate OllamaEmbeddings() was
# created just for indexing).
db1 = FAISS.from_documents(text_chunks, embeddings)

# Embed the question and fetch the three nearest chunks from the FAISS index
query = "What is the main topic of the article?"
docs = db1.similarity_search_by_vector(embeddings.embed_query(query), k=3)
# Show the text of the top-ranked match only
docs[0].page_content

'KRA Aspect Metric Target \nTechnology Technologies competence assessment rating (has comprehensive \nunderstating of the technologies and has applied the technologies; \nAchieved assessment score of 70% in Technology Test(s)) \n>= 70% \nSoftware \nEngineering \nEngineering competence assessment rating (is able to analyze, design and \ndevelop software components (both UI and server) to meet requirements; \nAchieved assessment score of 70% in SE Test and Project Work) \n>= 70% \nProblem Solving Problem solving competence assessment rating (is able to solve medium \ncomplexity problems that involve analytical and algorithmic thinking for \ndesign of optimal solution; Achieved assessment score of 70% in Problem \nSolving Hackathon)        \n>= 70% \nCommunication Communication competence assessment rating (is able to listen and \ncommunicate (oral, written, presentation) effectively; Achieved assessment \nscore of 70% in BCE-IC)  \n>= 70%'

In [40]:
# Create the Ollama LLM wrapper configured with the "gemma3" model; `llm` is
# consumed by the document-chain cell further down.
from langchain_community.llms import Ollama
llm = Ollama(model="gemma3")
# Bare last expression so the notebook displays the configured LLM object
llm



Ollama(model='gemma3')

In [53]:
## Design the chat prompt template
from langchain_core.prompts import ChatPromptTemplate

# The template has two placeholders: {context} for the supporting text and
# {input} for the user's question.
prompt = ChatPromptTemplate.from_template(
    "You are a helpful assistant. Answer the question based on the context provided.\n\n"
    "Context: {context}\n\n"
    "Question: {input}\n\n"
    "Answer:"
)

In [42]:
## Document chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Pair the LLM with the prompt template to form the document-combining chain
chain = create_stuff_documents_chain(llm, prompt)

In [43]:
"""
Retrievers:
- name: "document_retriever"
  type: "vector"
  params:
    embedding_model: "text-embedding-ada-002"
    index: "document_index"
- name: "context_retriever"
  type: "vector"
  params:
    embedding_model: "text-embedding-ada-002"
    index: "context_index"
"""

# Expose the Chroma store `db` through the retriever interface; no arguments are
# passed, so the retriever uses its default search settings.
# NOTE(review): the bare string earlier in this cell lists
# "text-embedding-ada-002" retrievers, but no such retrievers are created in the
# visible cells -- the only retriever is the one built here from `db`.
retriever = db.as_retriever()
# Bare last expression so the notebook displays the retriever's repr
retriever

VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x12be39a10>, search_kwargs={})

In [48]:
"""# Run the chain with a query"""

from langchain.chains import create_retrieval_chain

# Combine the retriever with the document chain into a single retrieval chain.
# Nothing is invoked in this cell; it only constructs `retrieval_chain`.
retrieval_chain = create_retrieval_chain(retriever, chain)

In [55]:
# Ask the question through the retrieval chain. Only "input" is supplied: the
# chain fills "context" itself with the documents returned by the retriever, so
# a hand-written "context" value would be replaced and never reach the prompt
# (the recorded answer is about the indexed PDF, not about any supplied text).
# NOTE(review): the retriever searches whatever was indexed into `db`, so make
# sure the intended source was the one loaded before the vector store was built.
response = retrieval_chain.invoke({
    "input": "What is the main topic of the article?"
})

In [56]:
# Display only the generated answer text from the chain's response
response["answer"]

'The main topic of the article is tracking progress against Key Results Areas (KRAs) for an associate’s development, including competency assessments, performance reporting, and continuous learning.'