In [5]:
import numpy as np

def retrieve_chunks(query, model, index, chunks, top_k=5):
    # Embed the query
    q_emb = model.encode([query])

    # Search FAISS
    distances, indices = index.search(np.array(q_emb).astype('float32'), top_k)

    # Return actual text chunks
    return [chunks[i] for i in indices[0]]


In [6]:
def build_prompt(question, retrieved_chunks):
    context = "\n\n".join(retrieved_chunks)
    prompt = f"""
You are a college AI tutor.
Answer the question strictly from the provided syllabus context.
If the answer is not present in the context, say: "This topic is not covered in the syllabus."

Context:
{context}

Question:
{question}

Answer clearly and simply:
"""
    return prompt


In [7]:
import requests


def ask_lmstudio(prompt):
    """Call LM Studio's OpenAI-compatible endpoint with basic error handling."""
    payload = {
        "model": "gemma-3-12b-it",  # set your loaded model name in LM Studio
        "messages": [
            {"role": "system", "content": "You are a helpful college tutor."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 256,
    }

    response = requests.post(
        "http://localhost:1234/v1/chat/completions",
        json=payload,
        timeout=30,
    )

    # If bad status, show the body for debugging and then raise
    if not response.ok:
        try:
            err_json = response.json()
        except Exception:
            err_json = response.text
        raise RuntimeError(f"LM Studio error {response.status_code}: {err_json}")

    data = response.json()

    # LM Studio should return an OpenAI-like payload with "choices"
    if "choices" not in data or not data["choices"]:
        raise RuntimeError(f"Unexpected response from LM Studio: {data}")

    return data["choices"][0]["message"]["content"]


In [8]:
import json
import faiss

index = faiss.read_index("backend/ml_book.index")
chunks = json.load(open("backend/ml_book_chunks.json"))

In [9]:
from sentence_transformers import SentenceTransformer

# Embedding model used both when building the FAISS index and when encoding new questions
embed_model = SentenceTransformer("all-MiniLM-L6-v2")



  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def answer_question(question):
    retrieved = retrieve_chunks(question, embed_model, index, chunks)
    prompt = build_prompt(question, retrieved)
    answer = ask_lmstudio(prompt)
    return answer


In [11]:
answer_question("What is Bayesian inference?")

RuntimeError: LM Studio error 400: {'error': 'Trying to keep the first 4163 tokens when context the overflows. However, the model is loaded with context length of only 4096 tokens, which is not enough. Try to load the model with a larger context length, or provide a shorter input'}