In [23]:
# with pip
%pip install --upgrade --quiet  supabase langchain langchain_community langchain_core langchain_openai pypdf

Note: you may need to restart the kernel to use updated packages.


In [None]:
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from glob import glob
from dotenv import load_dotenv

# ✅ Load .env variables
# Later cells read SUPABASE_URL, SUPABASE_KEY and OPENAI_API_KEY with os.getenv,
# so this call has to run before them.
load_dotenv()

In [None]:
import os
from supabase import create_client, Client

# ✅ Supabase setup
# Connection settings come from the environment; os.getenv returns None when a
# variable is not set.
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")

# Fail fast with a message that names the missing variable(s) instead of handing
# None to create_client and getting a less specific error from inside the library.
_missing_supabase_vars = [
    name
    for name, value in (("SUPABASE_URL", SUPABASE_URL), ("SUPABASE_KEY", SUPABASE_KEY))
    if not value
]
if _missing_supabase_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_supabase_vars)
        + ". Set them in the environment or in a .env file before running this cell."
    )

# Shared client used by the ingestion and query cells below.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


In [None]:
# ✅ Config
# OpenAI API key taken from the process environment (None when the variable is unset).
# NOTE(review): no visible cell references this name afterwards — presumably the
# OpenAI clients read the environment variable themselves; confirm before removing.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [15]:
# ✅ Initialize embeddings
# Name of the OpenAI embedding model, kept in one place so it is easy to spot and change.
EMBEDDING_MODEL = "text-embedding-3-small"

# Embedding client shared by the ingestion and query cells.
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

In [19]:
# ✅ Folder containing PDFs
# Glob pattern, relative to the notebook's working directory, selecting the PDFs to ingest.
pdf_folder = "dataset/*.pdf"

# glob() returns matches in arbitrary, filesystem-dependent order; sorting makes the
# ingestion order identical on every run and platform.
pdf_files = sorted(glob(pdf_folder))

In [26]:
# ✅ Loop over PDFs
def clean_text(text):
    """Return ``text`` with every NUL (U+0000) character removed.

    Only NUL is stripped; all other control characters are left untouched.
    NOTE(review): presumably needed because the target text column cannot
    store NUL characters — confirm against the table schema.
    """
    return text.replace("\x00", "")


# One store handle bound to the `chunks` table and the `match_chunks` query function.
# Creating it before the loop keeps `vector_store` defined even when no PDF matched
# the pattern, and avoids rebuilding an identical object for every file.
vector_store = SupabaseVectorStore(
    client=supabase,
    embedding=embeddings,
    table_name="chunks",
    query_name="match_chunks",
)

# NOTE(review): nothing here checks for an existing `documents` row, so re-running
# this cell inserts (and re-embeds) every PDF again.
for pdf_path in pdf_files:
    file_name = os.path.basename(pdf_path)
    print(f"📄 Processing: {file_name}")

    # 1. Load the PDF first, so an unreadable file fails before anything is written
    #    and no `documents` row is left behind without chunks.
    docs = PyPDFLoader(pdf_path).load()

    # 2. Insert metadata into `documents` and keep the generated id
    doc_record = supabase.table("documents").insert({"title": file_name}).execute()
    document_id = doc_record.data[0]["id"]

    # 3. Clean each loaded Document and attach document_id to its metadata
    for d in docs:
        d.page_content = clean_text(d.page_content)
        d.metadata["document_id"] = document_id

    # 4. Embed the Documents and store them in `chunks`
    vector_store.add_documents(docs)

print("✅ All PDFs processed and stored in Supabase!")

📄 Processing: 2412.00857v2.pdf
📄 Processing: 2502.11880v1.pdf
📄 Processing: 2504.08791v1.pdf
📄 Processing: 2504.11289v1.pdf
📄 Processing: 2504.12626v2.pdf
📄 Processing: lstm.pdf
📄 Processing: vnet.pdf
✅ All PDFs processed and stored in Supabase!


In [None]:
# ✅ Step 4. Query and Get Answers

In [36]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Build the store handle here rather than reusing whatever the ingestion loop last
# assigned: querying then works on a fresh kernel without re-ingesting the PDFs.
vector_store = SupabaseVectorStore(
    client=supabase,
    embedding=embeddings,
    table_name="chunks",
    query_name="match_chunks",
)

# Retrieve the 5 most similar chunks for each question.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",   # Stuff all retrieved docs into context
    return_source_documents=True
)

query = "What is lstm"
# Calling the chain object directly is deprecated in LangChain; invoke() is the
# supported entry point.
result = qa_chain.invoke({"query": query})
# Extract the answer
answer = result["result"]
# Extract sources: de-duplicated, entries without a source dropped, sorted so the
# printed list is stable between runs.
sources = sorted(
    {
        doc.metadata["source"]
        for doc in result["source_documents"]
        if doc.metadata.get("source")
    }
)

print("🤖 Answer:\n", answer)
print("\n📚 Sources Used:")
for s in sources:
    print("-", s)

🤖 Answer:
 LSTM stands for Long Short-Term Memory, which is a type of Recurrent Neural Network (RNN) designed to effectively learn from sequences of data over long periods of time. LSTMs are particularly useful for tasks that involve time series data or sequences, such as speech recognition, handwriting recognition, and machine translation. They address the limitations of standard RNNs, particularly the vanishing and exploding gradient problems, allowing them to retain information over longer time intervals (more than 1,000 timesteps). LSTMs achieve this through special memory cells and gating mechanisms that control the flow of information.

📚 Sources Used:
- dataset\lstm.pdf
