In [37]:
!pip install --upgrade pip
!pip install langchain_huggingface sentence-transformers langchain chromadb pypdf faiss-cpu  langchain_community scikit-learn matplotlib seaborn numpy mistralai langchain-mistralai langchain_classic

Collecting langchain_huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Downloading langchain_huggingface-1.2.0-py3-none-any.whl (30 kB)
Installing collected packages: langchain_huggingface
Successfully installed langchain_huggingface-1.2.0


In [47]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
# from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS,Chroma
from langchain_classic.schema import Document
import os


loader = PyPDFDirectoryLoader("./dataset/",glob="*.pdf")
docs = loader.load()


splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(docs)
print(f"Loaded {len(docs)} docs, split into {len(chunks)} chunks")
# inspect one
print(chunks)

Loaded 109 docs, split into 163 chunks
[Document(metadata={'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2025-12-09T20:37:37+05:30', 'title': '', 'author': 'Bhuvan Chandra Mothe', 'subject': '', 'moddate': '2025-12-09T20:37:37+05:30', 'source': 'dataset/Day-8-chunking.pdf', 'total_pages': 20, 'page': 0, 'page_label': '1'}, page_content='preencoded.png\nDay 8\nIngestion & Chunking'), Document(metadata={'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2025-12-09T20:37:37+05:30', 'title': '', 'author': 'Bhuvan Chandra Mothe', 'subject': '', 'moddate': '2025-12-09T20:37:37+05:30', 'source': 'dataset/Day-8-chunking.pdf', 'total_pages': 20, 'page': 1, 'page_label': '2'}, page_content='preencoded.png\nThe Art of Chunking: \nOptimizing RAG \nArchitecture\nUnderstanding how to break down information is crucial for \nefficient Retrieval Augmented Generation (RAG) systems. This \npresentat

In [48]:
print("Creating embeddings... this may take a moment.")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

Creating embeddings... this may take a moment.


In [49]:
vectorstore = FAISS.from_documents(chunks, embeddings)

In [50]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [51]:
from langchain_mistralai import ChatMistralAI

llm = ChatMistralAI(model="mistral-medium")

In [52]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains import create_retrieval_chain

prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}
""")

In [53]:
document_chain = create_stuff_documents_chain(llm, prompt)

In [54]:
rag_chain = create_retrieval_chain(retriever, document_chain)

In [2]:
import os
from getpass import getpass

if not os.getenv("MISTRAL_API_KEY"):
    os.environ["MISTRAL_API_KEY"] = getpass("Enter your Mistral API Key: ")

Enter your Mistral API Key: ··········


In [55]:
user_query = "What is llm ?"
response = rag_chain.invoke({"input": user_query})

print("\n--- RESPONSE ---")
print(response["answer"])


--- RESPONSE ---
Based on the provided context, **LLM** stands for **Large Language Model**, a type of AI system that is being used to transform industries and tasks by enabling applications like:

- **Chatbots & Assistants** (customer service, virtual helpers)
- **Coding Assistants** (code generation, debugging)
- **Summarization** (condensing long texts)
- **PDF Extraction** (pulling data from unstructured documents)
- **Agents & Workflows** (automating multi-step processes).
