In [None]:
!pip install faiss-cpu

In [None]:
from langchain.document_loaders import PyPDFLoader  # or PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import pickle

# Build a FAISS index over the course PDF:
# load -> chunk -> embed -> index -> persist index and chunk texts to disk.

# Step 1: Load PDF with LangChain
# pdf_path is the single place to change the input document.
pdf_path = "Online_Statistics_Education.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Step 2: Chunk documents
# Separators are tried in order, from paragraph breaks down to single characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
)
chunks = text_splitter.split_documents(documents)

# Step 3: Prepare texts
texts = [chunk.page_content for chunk in chunks]
if not texts:
    # Without this guard the failure surfaces later as an opaque IndexError on
    # embeddings.shape[1]; fail early with an actionable message instead.
    raise ValueError(
        f"No text chunks were extracted from {pdf_path!r}; nothing to index."
    )

# Step 4: Embed with SentenceTransformers
# `model` is reused by later cells to embed queries with the same encoder.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(texts)

# Step 5: Store in FAISS
# The index dimensionality is taken from the embedding matrix itself.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Save index and chunks
# Row i of the index corresponds to texts[i], so both files must be kept in sync.
faiss.write_index(index, "faiss_statistics.index")
with open("statistics_chunks.pkl", "wb") as f:
    pickle.dump(texts, f)

print(f"Stored {len(texts)} chunks in FAISS.")

In [None]:
# Retrieval smoke test: reload the persisted index and chunk texts, embed one
# query with the `model` created in the indexing cell, and print the top matches.

# Load index and chunks
index = faiss.read_index("faiss_statistics.index")
# CAUTION: pickle.load can execute arbitrary code. Only load a pickle file that
# this notebook wrote itself; never one obtained from an untrusted source.
with open("statistics_chunks.pkl", "rb") as f:
    texts = pickle.load(f)

query = "What is normal distribution?"
query_embedding = model.encode([query])
# D holds the distances, I the row ids of the nearest chunks (one row per query).
D, I = index.search(query_embedding, k=3)

# Print results
for idx in I[0]:
    if idx < 0:
        # FAISS pads with -1 when the index holds fewer than k vectors;
        # texts[-1] would silently print the last chunk as if it were a hit.
        continue
    print(texts[idx])
    print("-" * 40)

In [None]:
# Retrieval-augmented answer: embed the question, fetch the closest chunks from
# FAISS, and ask the chat model to answer using only that retrieved context.
import os  # required for os.getenv below; it was not imported anywhere else

import openai

# FAISS + embeddings + chunks are already loaded
query = "Explain standard deviation in simple terms"
query_embedding = model.encode([query])

# Retrieve top-3 matching chunks
D, I = index.search(query_embedding, k=3)
# Skip the -1 labels FAISS returns when fewer than k vectors are indexed;
# texts[-1] would otherwise inject an unrelated chunk into the context.
retrieved_chunks = [texts[i] for i in I[0] if i >= 0]

# Combine into a context prompt
context = "\n\n".join(retrieved_chunks)

# Build final prompt
prompt = f"""You are a helpful tutor. Based on the following course content, answer the user's question in a clear and concise way.

Context:
{context}

Question: {query}
Answer:"""

# Run the LLM query
# The key is read from the environment so it never appears in the notebook.
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    raise ValueError("API key is missing. Set the 'OPENAI_API_KEY' environment variable.")

client = openai.OpenAI(api_key=api_key)
response = client.chat.completions.create(
    model="gpt-4o-mini",  # or "gpt-4"
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant for statistics students."},
        {"role": "user", "content": prompt}
    ],
    temperature=0.3,
)

# Print response
print(response.choices[0].message.content)