In [15]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_mistralai.chat_models import ChatMistralAI
from langchain.chains import RetrievalQA
from langchain_mistralai import MistralAIEmbeddings
from langchain_community.vectorstores import FAISS
import os, re
from dotenv import load_dotenv
from langchain_core.documents import Document

# Load API key from .env
load_dotenv()
api_key = os.getenv("MISTRAL_API_KEY")

# Initialize LLM
llm = ChatMistralAI(
    api_key=api_key,
    model="mistral-small-latest",
    temperature=0,
)

# Load and process PDF
file_path = "C:\\Users\\shlok\\Desktop\\ML_Referenes\\Python_Datascience.pdf"
loader = PyPDFLoader(file_path)
doc = loader.load()

# Split into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
chunks_of_doc = splitter.split_documents(doc)
print(f"Total chunks created: {len(chunks_of_doc)}")

# Clean encoding
def clean_text(text):
    return text.encode("utf-8", "ignore").decode("utf-8", "ignore")

cleaned_chunks = [
    Document(page_content=clean_text(doc.page_content), metadata=doc.metadata)
    for doc in chunks_of_doc
]

# Embedding + FAISS
embeddings = MistralAIEmbeddings(model="mistral-embed")
vectorstore = FAISS.from_documents(cleaned_chunks, embeddings)

# User Question
question = "what is the work experience"
response = vectorstore.similarity_search(question, k=3)

# Clean answer formatting
def clean_output(text):
    if isinstance(text, list):
        text = " ".join([doc.page_content for doc in text])
    text = re.sub(r'[\n`*•#\-\d]+\.*\s*', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# RAG Chain
chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever())

# Get Final Answer from LLM
final_answer = chain.run(question)

# Output
print("\n🧠 Final Clean Answer:\n")
print(clean_output(final_answer))


Total chunks created: 1237





🧠 Final Clean Answer:

Members have access to thousands of books, training videos, Learning Paths, interac‐ tive tutorials, and curated playlists from over publishers, including O’Reilly Media, Harvard Business Review, Prentice Hall Professional, Addison Wesley Profes‐ sional, Microsoft Press, Sams, Que, Peachpit Press, Adobe, Focal Press, Cisco Press, John Wiley & Sons, Syngress, Morgan Kaufmann, IBM Redbooks, Packt, Adobe Press, FT Press, Apress, Manning, New Riders, McGraw Hill, Jones & Bartlett, and Course Technology, among others. For more information, please visit http://oreilly.com/safari. How to Contact Us Please address comments and questions concerning this book to the publisher: O’Reilly Media, Inc. Gravenstein Highway North Sebastopol, CA (in the United States or Canada) (international or local) (fax) We have a web page for this book, where we list errata, examples, and any additional your current area of expertise. Whether you are reporting election results, forecasting s