In [19]:
import os, pathlib
from dotenv import load_dotenv, dotenv_values

# If the kernel was started inside research/, step up one level so relative
# paths such as "data" and ".env" resolve against the project root.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

# Populate os.environ from .env; override=True lets the file replace values
# that are already set in the environment.
load_dotenv(override=True)

# Sanity report: only key names and presence flags are printed, never values.
vals = dotenv_values(".env")
print("Project root:", pathlib.Path.cwd())
print("Keys in .env:", list(vals))
print("Has PINECONE_API_KEY?", bool(os.environ.get("PINECONE_API_KEY")))
print("Has OPENROUTER_API_KEY?", bool(os.environ.get("OPENROUTER_API_KEY")))

Project root: /Users/lathifmohammadshaik/Desktop/sutent gpt/Buliding-a-complete-subject-chatbot-with-LLMs-LangChain-Pinecone-Flask-AWS
Keys in .env: ['PINECONE_API_KEY', 'OPENROUTER_API_KEY']
Has PINECONE_API_KEY? True
Has OPENROUTER_API_KEY? True


In [4]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

def load_pdf_files(data_dir="data"):
    """Load the files matching ``*.pdf`` in ``data_dir`` via PyPDFLoader.

    Args:
        data_dir: Directory handed to DirectoryLoader (default ``"data"``).

    Returns:
        The list produced by ``DirectoryLoader.load()``. Its length is also
        printed (the message labels the entries as pages).
    """
    pages = DirectoryLoader(data_dir, glob="*.pdf", loader_cls=PyPDFLoader).load()
    print(f"Loaded {len(pages)} pages")
    return pages

raw_docs = load_pdf_files(data_dir="data")

Loaded 834 pages


In [5]:
from typing import List
from langchain_core.documents import Document

def to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Rebuild each document with its text and a metadata dict holding only ``source``.

    The source is ``metadata["source"]`` when truthy, else
    ``metadata["file_path"]`` when truthy, else the literal ``"unknown"``.
    Missing or empty metadata is treated as an empty dict.
    """
    def _source_of(doc: Document) -> str:
        fields = doc.metadata or {}
        return fields.get("source") or fields.get("file_path") or "unknown"

    return [
        Document(page_content=doc.page_content, metadata={"source": _source_of(doc)})
        for doc in docs
    ]

minimal_docs = to_minimal_docs(raw_docs)
# Show the first document's metadata, or an empty dict when nothing was loaded.
print("Sample source:", minimal_docs[0].metadata if minimal_docs else {})

Sample source: {'source': 'data/CyBOK-version-1.0.pdf'}


In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split into chunks of at most 500 units with a 20-unit overlap; the unit is
# whatever length_function measures, here len() (i.e. characters).
splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=20,
    chunk_size=500,
)
texts_chunk = splitter.split_documents(minimal_docs)
print(f"Split completed: {len(texts_chunk)} chunks created.")

Split completed: 7178 chunks created.


In [7]:
from langchain_community.embeddings import HuggingFaceEmbeddings

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim
EMBED_DIM = 384  # width the rest of the notebook expects from EMBED_MODEL
# NOTE(review): the saved output of this cell echoes this constructor line,
# presumably from a deprecation warning — confirm and migrate if so.
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

# Smoke test: embed one query and fail here, rather than later at upsert time,
# if the vector width is not the expected one.
vec = embeddings.embed_query("hello world")
print("Vector length:", len(vec))  # should be 384
assert len(vec) == EMBED_DIM, (
    f"Expected {EMBED_DIM}-dim embeddings from {EMBED_MODEL}, got {len(vec)}"
)

  embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)


Vector length: 384


In [8]:
import os
from pinecone import Pinecone, ServerlessSpec

# Fail fast with a readable message when the key is absent or empty.
if not os.environ.get("PINECONE_API_KEY"):
    raise RuntimeError("No PINECONE_API_KEY in env.")

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

INDEX_NAME = "subject-chatbot"
DIMENSION = 384  # same width the embeddings cell reports for all-MiniLM-L6-v2
METRIC = "cosine"

# Reuse the index when it already exists; otherwise create a serverless one.
if pc.has_index(INDEX_NAME):
    print("✅ Using existing Pinecone index:", INDEX_NAME)
else:
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric=METRIC,
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print("✅ Created Pinecone index:", INDEX_NAME)

# Handle used for direct index operations.
index = pc.Index(INDEX_NAME)
print("✅ Connected to index:", INDEX_NAME)

✅ Using existing Pinecone index: subject-chatbot
✅ Connected to index: subject-chatbot


In [9]:
import os
from dotenv import load_dotenv, dotenv_values

# Reload .env so edits made since the kernel started take effect, then confirm
# the keys this notebook actually reads are present (values are never printed).
load_dotenv(override=True)
vals = dotenv_values(".env")
print("Has Pinecone key:", bool(os.getenv("PINECONE_API_KEY")))
# The LLM cell requires OPENROUTER_API_KEY; the earlier GROQ_API_KEY check
# tested a variable that no cell in this notebook uses.
print("Has OpenRouter key:", bool(os.getenv("OPENROUTER_API_KEY")))

Has Pinecone key: True
Has Groq key: True


In [23]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# NOTE(review): this cell redefines load_pdf_files, which an earlier cell
# already defines; the later definition silently shadows the earlier one.
def load_pdf_files(data_dir="data"):
    """Load the files matching ``*.pdf`` in ``data_dir`` via PyPDFLoader.

    Prints the number of loaded entries (labelled as pages) and returns
    the list produced by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(data_dir, loader_cls=PyPDFLoader, glob="*.pdf")
    loaded = pdf_loader.load()
    print(f"Loaded {len(loaded)} pages")
    return loaded

raw_docs = load_pdf_files()
# Stop here when nothing was loaded, so later cells do not run on an empty list.
assert raw_docs, "No PDFs found in ./data"

Loaded 834 pages


In [12]:
from typing import List
from langchain_core.documents import Document

def to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return new documents that keep the text and only a ``source`` metadata key.

    ``source`` falls back to ``file_path`` and then to ``"unknown"`` when the
    preceding value is missing or falsy. The resulting count is printed.
    """
    def _pick_source(doc: Document) -> str:
        fields = doc.metadata or {}
        return fields.get("source") or fields.get("file_path") or "unknown"

    slim = [
        Document(page_content=doc.page_content, metadata={"source": _pick_source(doc)})
        for doc in docs
    ]
    print("Minimized:", len(slim))
    return slim

minimal_docs = to_minimal_docs(raw_docs)

Minimized: 834


In [13]:
from typing import List
from langchain_core.documents import Document

# NOTE(review): identical to the to_minimal_docs cell directly above it.
def to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Strip each document's metadata down to a single ``source`` entry.

    Resolution order for the source value: ``metadata["source"]``, then
    ``metadata["file_path"]``, then ``"unknown"``. Prints how many documents
    were produced and returns them as a list.
    """
    def _slim(doc: Document) -> Document:
        info = doc.metadata or {}
        origin = info.get("source") or info.get("file_path") or "unknown"
        return Document(page_content=doc.page_content, metadata={"source": origin})

    minimized = list(map(_slim, docs))
    print("Minimized:", len(minimized))
    return minimized

minimal_docs = to_minimal_docs(raw_docs)

Minimized: 834


In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunks of at most 500 with an overlap of 20 between neighbours.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=500,
)
texts_chunk = splitter.split_documents(minimal_docs)
print("Chunks:", len(texts_chunk))

Chunks: 7178


In [15]:
from langchain_community.embeddings import HuggingFaceEmbeddings

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim
EMBED_DIM = 384  # width the rest of the notebook expects from EMBED_MODEL
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

# Smoke test: embed one query and fail here, rather than later at upsert time,
# if the vector width is not the expected one.
vec = embeddings.embed_query("hello world")
print("Vector length:", len(vec))  # should be 384
assert len(vec) == EMBED_DIM, (
    f"Expected {EMBED_DIM}-dim embeddings from {EMBED_MODEL}, got {len(vec)}"
)

Vector length: 384


In [16]:
from langchain_pinecone import PineconeVectorStore

import hashlib
import os

INDEX_NAME = "subject-chatbot"  # same name you created


def _stable_chunk_ids(chunks):
    """Return one deterministic vector ID per chunk.

    Each ID is the SHA-256 of the source file's base name plus the chunk text,
    followed by an occurrence counter so that identical chunks within one run
    still get distinct IDs. Only the base name is hashed, so the same PDF
    loaded as "data/x.pdf" or "../data/x.pdf" yields the same IDs.
    """
    seen = {}
    ids = []
    for chunk in chunks:
        source = str((chunk.metadata or {}).get("source", "unknown"))
        payload = f"{os.path.basename(source)}\x00{chunk.page_content}"
        digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()
        occurrence = seen.get(digest, 0)
        seen[digest] = occurrence + 1
        ids.append(f"{digest}-{occurrence}")
    return ids


# Without explicit ids every run of this cell adds a fresh copy of every chunk
# to the index, which is why retrieval returns the same passage more than once.
# Deterministic ids make a re-run overwrite the existing vectors instead.
# NOTE: copies inserted by earlier runs are not removed by this change; clear
# the index once if it already contains duplicates.
vectorstore = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embeddings,
    index_name=INDEX_NAME,
    ids=_stable_chunk_ids(texts_chunk),
)
print(f"Upserted {len(texts_chunk)} chunks into index: {INDEX_NAME}")

Upserted 7178 chunks into index: subject-chatbot


In [17]:
# Similarity retriever returning the 3 closest chunks.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

query = "Give a short overview of the document's main subject."
docs = retriever.invoke(query)  # .invoke in LC v0.2+
print(f"Retrieved {len(docs)} chunks.")

# One line per hit: rank, source path, and the first 120 characters with
# newlines flattened to spaces.
for i, d in enumerate(docs, start=1):
    src = d.metadata.get("source", "unknown")
    preview = " ".join(d.page_content[:120].split("\n"))
    print(f"{i:>2}. source={src} | preview={preview}…")

Retrieved 3 chunks.
 1. source=data/CyBOK-version-1.0.pdf | preview=The table below lists the reference material that serves as the basis for for this chapter and explains how it relates t…
 2. source=../data/CyBOK-version-1.0.pdf | preview=The table below lists the reference material that serves as the basis for for this chapter and explains how it relates t…
 3. source=../data/CyBOK-version-1.0.pdf | preview=much of this knowledge area’s content will be novel to those whose education is based in sci- ence, technology, engineer…


In [18]:
from langchain_openai import ChatOpenAI
import os

# --- OpenRouter DeepSeek model ---
# Stop early with an actionable message when the key is absent or empty.
if not os.environ.get("OPENROUTER_API_KEY"):
    raise RuntimeError("No OPENROUTER_API_KEY in env. Add it to your .env file.")

# ChatOpenAI is pointed at OpenRouter's endpoint through base_url; the model
# argument is an OpenRouter model slug.
chat_llm = ChatOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ.get("OPENROUTER_API_KEY"),
    model="tngtech/deepseek-r1t2-chimera:free",
    temperature=0,
)

print("OpenRouter DeepSeek LLM ready.")

OpenRouter DeepSeek LLM ready.


In [20]:
# LangChain v1.x RAG (works with 1.0.5)

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# 1) Similarity retriever over the existing vectorstore, top 3 chunks per query
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

# 2) Helper: format retrieved docs into a single context string
def format_docs(docs):
    """Concatenate the ``page_content`` of each item in ``docs``.

    Consecutive contents are separated by a blank line; an empty input
    yields an empty string.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# 3) Prompt
# System message: instructions followed by the retrieved context, which is
# substituted for the {context} placeholder when the template is filled in.
system_prompt = (
    "You are a concise QA assistant.\n"
    "Use the provided context to answer the user's question.\n"
    "If you don't know, say you don't know. Keep answers short.\n\n"
    "{context}"
)
# Two-message template: the system text above, then the user's question
# through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# 4) Assemble the RAG pipeline.
#    The question string feeds both branches of the mapping:
#      "context" -> retriever -> format_docs (one text block)
#      "input"   -> passed through unchanged
#    The mapping fills the prompt, the prompt goes to the chat model, and the
#    model's reply is reduced to plain text.
rag_chain = (
    dict(context=retriever | format_docs, input=RunnablePassthrough())
    | prompt
    | chat_llm  # the OpenRouter-backed model configured in the LLM cell
    | StrOutputParser()
)

print("✅ RAG chain ready (v1.x).")

✅ RAG chain ready (v1.x).


In [21]:
# End-to-end check: the chain takes the bare question string and returns text.
question = "Give a short overview of the document’s main subject."
answer = rag_chain.invoke(input=question)

print("— ANSWER —\n", answer)

— ANSWER —
 
The document's main subject is **Physical Layer Security and Telecommunications**. It serves as a reference guide for a chapter, organizing material by topics and sub-topics to explain their relevance.
