Data Ingestion

In [None]:
# Read speech.txt through LangChain's TextLoader and display what was loaded.
from langchain_community.document_loaders import TextLoader

loader = TextLoader("speech.txt")
text_documents = loader.load()
text_documents

In [None]:
# Load variables from a local .env file (if any) into the process environment.
import os
from dotenv import load_dotenv

load_dotenv()

# The previous `os.environ[KEY] = os.getenv(KEY)` re-assigned a value that was
# already in os.environ when the key existed, and raised TypeError (os.environ
# only accepts str) when it did not. Report a missing key instead of crashing:
# only the optional OpenAI embedding path needs it.
if not os.getenv("OPENAI_API_KEY"):
    print("OPENAI_API_KEY is not set; OpenAI-backed cells will not work.")


Web Based Loader

The message "USER_AGENT environment variable not set" is a warning, not a failure: it means the `USER_AGENT` environment variable was missing when WebBaseLoader was loaded, so requests are sent without a User-Agent header that identifies them (many websites block or throttle such requests for security reasons).

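In Git Bash or Terminal, run the following before starting Jupyter (so the kernel inherits the variable):
export USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"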

This tells the website:

The request is coming from Google Chrome (v120)

Running on Windows 10 (64-bit)

Using AppleWebKit (browser engine)


In [None]:
import os

# Set the User-Agent environment variable.
# - setdefault keeps a value exported in the shell or loaded from .env and only
#   falls back to a browser-like string when none exists. The previous
#   `os.environ["USER_AGENT"] = os.getenv("USER_AGENT")` raised TypeError in
#   exactly the "not set" case it was meant to handle.
# - This runs before the loader import because the "USER_AGENT environment
#   variable not set" warning appears to be raised while the loader module is
#   loaded (NOTE(review): confirm against the installed langchain_community).
os.environ.setdefault(
    "USER_AGENT",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
)

import bs4  # (BeautifulSoup 4) is a Python library used for web scraping and parsing HTML and XML
from langchain_community.document_loaders import WebBaseLoader

# Keep only the elements whose CSS class is one of these three.
post_sections = bs4.SoupStrainer(
    class_=("post-title", "post-content", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(parse_only=post_sections),
)
text_documents = loader.load()
text_documents

In [None]:
# Load the "Attention Is All You Need" PDF from the working directory.
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("attention-is-all-you-need.pdf")
docs = loader.load()

In [None]:
# Preview only the first entries; displaying the whole list floods the cell
# output with the text of every loaded document.
docs[:2]

In [None]:
# Split the loaded PDF documents into overlapping chunks and preview the first five.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
documents = text_splitter.split_documents(docs)
documents[:5]

In [None]:
# Number of chunks produced by the splitter.
len(documents)

In [None]:
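# ## Vector Embedding And Vector Store
# Alternative (disabled): build the vector store with OpenAI embeddings instead
# of the local Ollama embeddings used in the cells below. Needs OPENAI_API_KEY.
# from langchain_openai import OpenAIEmbeddings
# from langchain_community.vectorstores import Chroma
# db = Chroma.from_documents(documents,OpenAIEmbeddings())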

📌 Which Model to Use? (OllamaEmbeddings model)

Best for General Use → nomic-embed-text

Smaller, Faster Model → mistral

Latest LLaMA Model → llama3

Google’s Open Model → gemma

Sentence Transformers → all-MiniLM-L6-v2 (published in the Ollama library as `all-minilm`)

In [None]:
# Embedding backend for the vector stores below, served by Ollama.
from langchain_community.embeddings import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    model="llama3.2",
)

In [None]:

# Build a Chroma vector store from the first eight chunks only.
from langchain_community.vectorstores import Chroma

db = Chroma.from_documents(documents=documents[:8], embedding=embeddings)

In [None]:
# Run a similarity search against the current store and print the first hit.
query = "Who are the authors of attention is all you need?"
retrieved_results = db.similarity_search(query)
top_hit = retrieved_results[0]
print(top_hit.page_content)

In [None]:
# Rebind `db` to a FAISS index built from the first ten chunks; every later
# cell that uses `db` works against this store.
from langchain_community.vectorstores import FAISS

faiss_chunks = documents[:10]
db = FAISS.from_documents(faiss_chunks, embeddings)

In [None]:
# Ask the store a second question and print the best-matching chunk.
query = "what is attention?"
retrieved_results = db.similarity_search(query)
best_match = retrieved_results[0]
print(best_match.page_content)

In [None]:
# Language model used to generate answers, served by Ollama.
from langchain_community.llms import Ollama

llm = Ollama(
    model="gemma3",
)
llm

In [None]:
# Prompt with two placeholders: {context} for the supplied documents and
# {input} for the user's question.
from langchain_core.prompts import ChatPromptTemplate

rag_template = """
Answer the following question based only on the provided context.
Think step by step before providing a detailed answer.
<context>
{context}
</context>
Question: {input}
"""
prompt = ChatPromptTemplate.from_template(rag_template)

In [None]:
# Combine the LLM and the prompt into a chain that answers from supplied documents.
from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

In [None]:
# Retrievers: a retriever is an interface that returns documents given an
# unstructured query. It is more general than a vector store: a retriever does
# not need to be able to store documents, only to return (or retrieve) them.
# Vector stores can be used as the backbone of a retriever, but there are other
# types of retrievers as well.
retriever = db.as_retriever()
retriever

In [None]:
# Retrieval chain: this chain takes a user inquiry and passes it to the
# retriever to fetch relevant documents. Those documents are then passed to an
# LLM to generate a response.
from langchain.chains import create_retrieval_chain

retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [None]:
# Run the retrieval chain; the "input" key matches the {input} placeholder in the prompt.
response=retrieval_chain.invoke({"input":"what is attention function?"})

In [None]:
# Display the value stored under the "answer" key of the chain's response.
response['answer']