In [0]:
%pip install pypdf langchain faiss-cpu sentence-transformers
# Installs the base packages with the %pip line magic; no versions are pinned.
# NOTE(review): pypdf is not imported by any cell visible in this notebook — confirm it is still needed.

In [0]:
# Databricks Notebook: RAG System from Multiple PPTX Files with System Prompt

## Step 1: Install Required Packages
%pip install -U langchain-community sentence-transformers python-pptx faiss-cpu
# -U upgrades any already-installed copy to the newest release; versions are not pinned,
# so the resolved packages can differ from one run to the next.
# NOTE(review): sentence-transformers and faiss-cpu are also listed in the earlier %pip cell.

## Step 2: Extract Text from PPTX Files
from pptx import Presentation
import os

def extract_text_from_pptx(file_path):
    """Collect the text of every text-bearing shape in a presentation.

    Slides are walked in the order ``Presentation(...).slides`` yields them,
    and shapes in the order ``slide.shapes`` yields them. A shape contributes
    only if it exposes a ``text`` attribute; its text is followed by a newline.

    Args:
        file_path: Value handed straight to ``Presentation``.

    Returns:
        One string holding all collected text, or ``""`` when nothing was found.
    """
    deck = Presentation(file_path)
    pieces = [
        shape.text + "\n"
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")  # shapes without a text attribute are skipped
    ]
    return "".join(pieces)

# Folder scanned (non-recursively) for presentations; replace with the real location.
pptx_dir = "your PPT stored location"

# One extracted-text string per presentation, in sorted filename order.
documents = []
# os.listdir returns names in arbitrary order; sorting keeps the chunk and
# index order reproducible between runs.
for filename in sorted(os.listdir(pptx_dir)):
    # "~$name.pptx" files are PowerPoint lock files, not real presentations.
    if filename.startswith("~$"):
        continue
    # Match the extension case-insensitively so "DECK.PPTX" is not silently ignored.
    if not filename.lower().endswith(".pptx"):
        continue
    full_path = os.path.join(pptx_dir, filename)
    documents.append(extract_text_from_pptx(full_path))

# Stop here with a clear message instead of failing later while building the index.
if not documents:
    raise FileNotFoundError(f"No .pptx files found in {pptx_dir!r}")

## Step 3: Chunk the Text
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split each presentation's text into chunks of at most 500 characters,
# with 50 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

# One Document per chunk, built from the raw strings in `documents`.
docs = text_splitter.create_documents(documents)

## Step 4: Embed and Save FAISS Index
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Folder the index is written to; replace with the real location.
index_dir = "Your index storage"

# Embedding model applied to every chunk.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed the chunks into a FAISS vector store, then persist it to disk.
faiss_store = FAISS.from_documents(docs, embedding_model)
faiss_store.save_local(index_dir)

## Step 5: Reload the FAISS Index (explicitly allowing pickle deserialization)
# The saved index includes a pickle file, and unpickling can execute arbitrary
# code, so LangChain requires this explicit opt-in. Only load an index written
# by this notebook or by another source you trust.
faiss_store = FAISS.load_local(
    index_dir,
    embedding_model,
    allow_dangerous_deserialization=True,
)

## Step 6: Configure LLM with System Prompt
# ChatDatabricks comes from langchain_community, the same package this notebook
# already uses for FAISS and HuggingFaceEmbeddings; the `langchain.chat_models`
# path is only a deprecated re-export of that class.
from langchain_community.chat_models import ChatDatabricks
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chains import RetrievalQA

# System instructions placed ahead of every question.
system_prompt = """You are a helpful IT project Use case creator who extracts insights from corporate presentations.
Be concise, accurate, and avoid repetition. Respond in bullet points where appropriate.Do not make things up, when you dont have context"""

# Chat prompt: the system message first, then a human message with the
# {context} and {question} placeholders.
system_message = SystemMessagePromptTemplate.from_template(system_prompt)
human_message = HumanMessagePromptTemplate.from_template("Context:\n{context}\n\nQuestion: {question}")
prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# Chat model behind the named Databricks serving endpoint; max_tokens is set to 300.
llm = ChatDatabricks(endpoint="databricks-llama-4-maverick", max_tokens=300)

# Retriever backed by the FAISS store loaded above (default search settings).
retriever = faiss_store.as_retriever()

# RetrievalQA chain of type "stuff", wired to the custom chat prompt.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
)

## Step 7: Ask Multiple Questions
questions = [
    "Which usecases are you covering?",
    "What is outline of any use cases?",
    "Where customer expectation was met?.",
    "Who is Antony?",
]

for q in questions:
    print(f"Q: {q}")
    # Chain.run() is deprecated. invoke() takes the chain's input key ("query")
    # and returns a dict whose "result" entry is the answer text run() used to return.
    answer = rag_chain.invoke({"query": q})["result"]
    print("A:", answer)
    print("------")
